Analysis of > 13000 free text responses to the question in Survey 5
if_you_could_improve_one_thing_about_the_website_what_would_it_be?
library(tidyverse)
library(janitor)
library(myScrapers)
library(quanteda)
library(mclust)
library(parallel)
library(Rtsne)
library(dbscan)
library(quanteda.textmodels)
quanteda::quanteda_options(threads = 8)
c_survey <- read_csv("~/Documents/Covid-19 survey v0.05 (Responses) - Form Responses 1.csv") %>%
clean_names()
free_text_1 <- c_survey %>%
select(starts_with("if_you_could")) %>%
filter(!is.na(if_you_could_improve_one_thing_about_the_website_what_would_it_be)) %>%
mutate(pmid = row_number(),
absText = if_you_could_improve_one_thing_about_the_website_what_would_it_be,
title = pmid
)
dim(free_text_1)
c_2 <- create_abstract_corpus(free_text_1)
c_2
#create_abstract_cluster
## remove numbers
corpus <- c_2$corpus %>% filter(!is.na(pmid), str_detect(word, "[[:alpha:]]" ))
## extract ids
pmid_1 <- pull(corpus, "pmid") %>% unique() %>% enframe()
free_text_1 <- free_text_1 %>%
filter(pmid %in% pmid_1$value)
## reduce dimension
#tsne <- corpus %>% cast_sparse(pmid, word, tf_idf) %>% as.matrix() %>%
#Rtsne(check_duplicates = FALSE, perplexity = 50)
#tsne %>% saveRDS("tsne.rds")
## load
tsn <- read_rds("tsne.rds")
## dbscan method
set.seed(123)
## create cluster
clust <- hdbscan(tsn$Y, minPts = 60)
pairs(tsn$Y)
## combine datasets
clustering <- data.frame(cbind(pmid = pmid_1, tsn$Y, cluster = clust$cluster)) %>%
mutate(V2 = as.numeric(as.character(X1)), V3 = as.numeric(as.character(X2)))
clustering <- clustering %>% mutate(clustered = ifelse(cluster == 0, "not-clustered", "clustered"))
## create labels
labels <- create_cluster_labels(corpus, clustering, top_n = 6)
## classify statements
classification <- labels$results %>%
left_join(free_text_1, by = c("pmid.value" = "pmid"))
# classification %>%
# count(clus_names, sort = TRUE)
## recode categories
categorised <- classification %>%
mutate(broad_cat = case_when(str_detect(clus_names, "mobile|iPad") ~ "mobile",
str_detect(clus_names, "vaccin") ~ "vaccination-data",
str_detect(clus_names, "add") ~ "add-misc.",
str_detect(clus_names, "zoom") ~ "zooming issues",
#str_detect(clus_names, "ag") ~ "age-breakdowns",
str_detect(clus_names, "region") ~ "regional data",
str_detect(clus_names, "colour") ~ "colour-issues",
str_detect(clus_names, "compar|previou") ~ "comparisons",
str_detect(clus_names, "crash|fail|interact") ~ "map-stability",
str_detect(clus_names, "detail") ~ "detail",
str_detect(clus_names, "breakdown") ~ "breakdowns",
str_detect(clus_names, "excel") ~ "praise",
str_detect(clus_names, "death") ~ "deaths",
str_detect(clus_names, "time|consist") ~ "get-figures-out-on-time",
str_detect(clus_names, "local|msoa") ~ "granularity",
str_detect(clus_names, "home") ~ "homepage",
str_detect(clus_names, "date|frequent|lag") ~ "up-to-date|updates",
str_detect(clus_names, "link") ~ "awareness",
str_detect(clus_names, "easi|navig") ~ "navigation",
str_detect(clus_names, "graph") ~ "graph-issues",
str_detect(clus_names, "pcr") ~ "testing-date",
str_detect(clus_names, "post-code|postcod") ~ "post-code-issues",
str_detect(clus_names, "recov") ~ "recovery",
str_detect(clus_names, "rate") ~ "rates-R"))
library(treemap)
library(d3treeR)
### create scatter plot of responses
## calculcate median values of each cluster
cross_hairs <- classification %>%
group_by(clus_names) %>%
summarise(medX = median(V2),
medY = median(V3))
## plot scatter plot with colours for each cluster
classification %>%
as_tibble() %>%
ggplot(aes(V2, V3)) +
geom_jitter(aes(colour = clus_names), show.legend = FALSE, , alpha = 0.4) +
geom_point(aes(medX, medY), data = cross_hairs , shape = "X", size = 5) +
ggrepel::geom_text_repel(aes(label = clus_names, medX, medY), data = cross_hairs, size = 4) +
theme(axis.text = element_blank(),
panel.background = element_blank()) +
labs(title = "Estimated clustering of 13085 free text responses to\n'What one thing would you improve about the website'",
x = "",
y = ""
)
### table of responses in random samples from each cluster
classification %>%
group_by(clus_names) %>%
mutate(n = n()) %>%
sample_n(5) %>%
select(clus_names, absText, n) %>%
DT::datatable()
library(rsample)
## holdout unclustered data
heldout <- categorised %>%
filter(cluster == 0)
## training data
training <- categorised %>%
filter(cluster != 0)
## test train split
split <- initial_split(training, strata = broad_cat)
train <- training(split)
test <- testing(split)
## create a document term matrix, remove common words, numbers, punctuation, apply stemming
train_dfm <- corpus(train, text_field = "absText") %>%
dfm(., remove = stopwords("en"), remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)
#docvars(train_dfm)
test_dfm <- corpus(test, text_field = "absText") %>%
dfm(., remove = stopwords("en"), remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)
## build model
mod1 <- textmodel_svm(train_dfm, y = docvars(train_dfm, "broad_cat"))
## caret is modelling package - we are using to measure accuracy
library(caret)
## match features in test set to training set
dfm_match <- dfm_match(test_dfm, features = featnames(train_dfm))
## extract labels we are trying to predict
actual <- dfm_match$broad_cat
## predict labels
predicted <- predict(mod1, newdata = dfm_match)
## compare accuracy of predictions with labels
caret::confusionMatrix(table(actual, predicted)) %>%
tidy() %>%
filter(str_detect(term, "accuracy"))
## # A tibble: 17 x 6
## term class estimate conf.low conf.high p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 accuracy <NA> 0.934 0.922 0.944 0
## 2 balanced_accuracy awareness 0.928 NA NA NA
## 3 balanced_accuracy colour-issues 1 NA NA NA
## 4 balanced_accuracy comparisons 0.948 NA NA NA
## 5 balanced_accuracy deaths 0.960 NA NA NA
## 6 balanced_accuracy get-figures-out-on-time 0.966 NA NA NA
## 7 balanced_accuracy granularity 0.948 NA NA NA
## 8 balanced_accuracy graph-issues 0.899 NA NA NA
## 9 balanced_accuracy map-stability 0.962 NA NA NA
## 10 balanced_accuracy navigation 0.938 NA NA NA
## 11 balanced_accuracy post-code-issues 0.986 NA NA NA
## 12 balanced_accuracy recovery 0.971 NA NA NA
## 13 balanced_accuracy regional data 0.958 NA NA NA
## 14 balanced_accuracy testing-date 0.983 NA NA NA
## 15 balanced_accuracy up-to-date|updates 0.970 NA NA NA
## 16 balanced_accuracy vaccination-data 0.979 NA NA NA
## 17 balanced_accuracy zooming issues 0.986 NA NA NA
held_dfm <- corpus(heldout, text_field = "absText") %>%
dfm(., remove = stopwords("en"), remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)
total_match <- dfm_match(held_dfm, features = featnames(train_dfm))
## predict labels
predict_total <- predict(mod1, newdata = total_match)
## add predictions
predicted_data <- data.frame(heldout, predicted = predict_total)
glimpse(predicted_data)
## Rows: 4,850
## Columns: 14
## $ pmid.name <int> 1, …
## $ pmid.value <int> 1, …
## $ X1 <dbl> -10…
## $ X2 <dbl> 11.…
## $ cluster <dbl> 0, …
## $ V2 <dbl> -10…
## $ V3 <dbl> 11.…
## $ clustered <chr> "no…
## $ clus_names <chr> "va…
## $ if_you_could_improve_one_thing_about_the_website_what_would_it_be <chr> "ra…
## $ absText <chr> "ra…
## $ title <int> 1, …
## $ broad_cat <chr> "va…
## $ predicted <fct> vac…
glimpse(training)
## Rows: 8,235
## Columns: 13
## $ pmid.name <int> 3, …
## $ pmid.value <int> 4, …
## $ X1 <dbl> 43.…
## $ X2 <dbl> -20…
## $ cluster <dbl> 5, …
## $ V2 <dbl> 43.…
## $ V3 <dbl> -20…
## $ clustered <chr> "cl…
## $ clus_names <chr> "lo…
## $ if_you_could_improve_one_thing_about_the_website_what_would_it_be <chr> "Th…
## $ absText <chr> "Th…
## $ title <int> 4, …
## $ broad_cat <chr> "ma…
pred <- predicted_data %>%
select(-broad_cat, broad_cat = predicted) %>%
bind_rows(training)
counts <- pred %>%
count(broad_cat, clus_names)
counts
## broad_cat
## 1 awareness
## 2 awareness
## 3 colour-issues
## 4 colour-issues
## 5 comparisons
## 6 comparisons
## 7 comparisons
## 8 deaths
## 9 deaths
## 10 get-figures-out-on-time
## 11 get-figures-out-on-time
## 12 get-figures-out-on-time
## 13 get-figures-out-on-time
## 14 granularity
## 15 granularity
## 16 granularity
## 17 granularity
## 18 granularity
## 19 granularity
## 20 granularity
## 21 granularity
## 22 graph-issues
## 23 graph-issues
## 24 map-stability
## 25 map-stability
## 26 map-stability
## 27 map-stability
## 28 map-stability
## 29 map-stability
## 30 map-stability
## 31 map-stability
## 32 map-stability
## 33 map-stability
## 34 map-stability
## 35 map-stability
## 36 map-stability
## 37 navigation
## 38 navigation
## 39 navigation
## 40 post-code-issues
## 41 post-code-issues
## 42 recovery
## 43 recovery
## 44 regional data
## 45 regional data
## 46 testing-date
## 47 testing-date
## 48 up-to-date|updates
## 49 up-to-date|updates
## 50 up-to-date|updates
## 51 vaccination-data
## 52 vaccination-data
## 53 vaccination-data
## 54 vaccination-data
## 55 vaccination-data
## 56 vaccination-data
## 57 vaccination-data
## 58 vaccination-data
## 59 vaccination-data
## 60 vaccination-data
## 61 vaccination-data
## 62 vaccination-data
## 63 zooming issues
## 64 zooming issues
## 65 zooming issues
## 66 zooming issues
## 67 zooming issues
## clus_names n
## 1 link-site-websit-inform-map-data 563
## 2 vaccin-local-graph-dai-map-data 630
## 3 colour-code-chang-rate-interact-map 209
## 4 vaccin-local-graph-dai-map-data 20
## 5 comparison-previou-week-compar-averag-data 70
## 6 previou-week-averag-dai-graph-data 245
## 7 vaccin-local-graph-dai-map-data 148
## 8 excess-covid-death-includ-dai-data 142
## 9 vaccin-local-graph-dai-map-data 164
## 10 figur-time-daili-graph-dai-data 126
## 11 real-lag-consist-time-report-map-data 133
## 12 regular-consist-updat-time-dai-data 161
## 13 vaccin-local-graph-dai-map-data 222
## 14 histor-select-level-author-download-local-map-data 184
## 15 inform-week-date-local-dai-map-data 80
## 16 localis-easi-provid-clearer-inform-data 103
## 17 post-code-search-local-data-map 119
## 18 roll-rate-infect-local-map-data 143
## 19 sourc-msoa-histor-download-displai-set-data 316
## 20 vaccin-local-graph-dai-map-data 852
## 21 week-date-report-local-graph-data 321
## 22 logarithm-size-larger-log-confus-axi-scale-clearer-vertic-graph 100
## 23 vaccin-local-graph-dai-map-data 8
## 24 constantli-lot-frequent-reload-phone-crash-map 209
## 25 devic-mobil-phone-interact-graph-map 186
## 26 doesn’t-ipad-reload-crash-interact-map 146
## 27 excel-it’-updat-interact-map-data 116
## 28 fail-frequent-reload-interact-map-data 113
## 29 interact-map 65
## 30 iphon-doesn’t-reload-io-crash-interact-map 109
## 31 lot-constantli-reload-crash-interact-map 84
## 32 reliabl-updat-crash-time-interact-map-data 67
## 33 speed-faster-fail-load-interact-map 142
## 34 stabil-improv-crash-interact-map-data 125
## 35 stabl-reload-uk-bit-map-interact 151
## 36 vaccin-local-graph-dai-map-data 654
## 37 navig-understand-easier-graph-map-data 185
## 38 touch-axi-easi-read-scale-graph-data 196
## 39 vaccin-local-graph-dai-map-data 186
## 40 rememb-town-search-postcod-map-data 256
## 41 vaccin-local-graph-dai-map-data 163
## 42 recov-discharg-hospit-peopl-covid-data 377
## 43 vaccin-local-graph-dai-map-data 92
## 44 breakdown-region-nation-compar-graph-data-map 131
## 45 vaccin-local-graph-dai-map-data 97
## 46 pcr-percentag-test-posit-peopl-data 239
## 47 vaccin-local-graph-dai-map-data 156
## 48 lag-averag-dai-date-map-graph-data 82
## 49 notif-quicker-frequent-healthcar-updat-data 181
## 50 vaccin-local-graph-dai-map-data 222
## 51 author-level-postcod-local-vaccin-data-map 76
## 52 axi-add-inform-vaccin-graph-local-data 138
## 53 brand-administ-project-prioriti-dose-type-vaccin 61
## 54 breakdown-band-ag-death-vaccin-data 363
## 55 detail-inform-local-graph-map-vaccin-data 186
## 56 home-page-summari-vaccin-graph-data 167
## 57 homepag-total-weekli-daili-vaccin-graph-data 153
## 58 manufactur-homepag-api-link-includ-vaccin-map-data 88
## 59 pfizer-dose-prioriti-progress-vaccin-data 106
## 60 popul-percentag-prioriti-total-peopl-vaccin-data 150
## 61 vaccin-local-graph-dai-map-data 892
## 62 variant-info-date-local-vaccin-updat 145
## 63 axi-function-abil-zoom-graph-date 108
## 64 bar-chart-zoom-graph-dai-data 71
## 65 constantli-stop-crash-zoom-interact-time-map 152
## 66 hard-devic-touch-mobil-phone-zoom-graph 96
## 67 vaccin-local-graph-dai-map-data 344
palette = viridis::viridis(20)
t <- treemap(counts,
index = c("broad_cat", "clus_names"),
vSize = "n",
type = "index",
palette = palette)
d3tree3(t, rootname = "Grouped answers")
predicted_data %>%
group_by(predicted) %>%
sample_n(3) %>%
select( absText, clus_names, predicted, broad_cat) %>%
DT::datatable()